
from init import PATHS
import logging
LOGGER = logging.getLogger(__name__)
import pandas as pd
from C_PatentVariables import collecting_patstat, family_aggregation, firm_year_aggregation


def get_list_of_patent_ids(list_bvdids, namefile):
    """Look up the patent identifiers linked to a collection of bvdids.

    Reads the Orbis-PATSTAT correspondence CSV, keeps the rows whose
    ``bvdid`` is in ``list_bvdids`` and extracts the distinct identifiers.

    Args:
        list_bvdids: Collection of bvdid values to keep (passed to
            ``Series.isin``).
        namefile: Label used only as a prefix in the log messages.

    Returns:
        A tuple ``(list_appln_id, list_docdb_id)``: the de-duplicated
        ``appln_id`` values and the de-duplicated ``docdb_family_id`` values
        of the matching rows, each as a plain list.
    """
    LOGGER.info('get_list_of_patent_ids')

    # Orbis <-> PATSTAT correspondence table
    correspondence_csv = PATHS.privatedata / 'Patstat_orbis_correspondence/Orbis_PATSTAT_updatePEM_anonymized.csv'
    links = pd.read_csv(correspondence_csv)
    is_requested = links['bvdid'].isin(list_bvdids)
    links = links.loc[is_requested]

    appln_ids = links['appln_id'].drop_duplicates().tolist()
    docdb_ids = links['docdb_family_id'].drop_duplicates().tolist()

    LOGGER.info('{}: Nbr appln_id: {}'.format(namefile, len(appln_ids)))
    LOGGER.info('{}: Nbr docdb_family_id: {}'.format(namefile, len(docdb_ids)))
    return appln_ids, docdb_ids


def get_patent_info_for_list_of_bvdids(list_bvdids, namefile, test):
    """Run the patent collection pipeline for one group of bvdids.

    The identifiers are first looked up in the Orbis-PATSTAT correspondence,
    then handed to ``collecting_patstat`` / ``family_aggregation`` in a fixed
    order. Nothing is returned; results are produced by the called helpers.

    Args:
        list_bvdids: Collection of bvdids whose patents are collected.
        namefile: Label (e.g. ``'OEMs'``) forwarded to every helper.
        test: Flag forwarded unchanged to the ``collecting_patstat`` helpers.
    """
    # Identifiers known from the Orbis-PATSTAT correspondence
    appln_ids, docdb_ids = get_list_of_patent_ids(list_bvdids, namefile)
    # Extra pass through PATSTAT so the docdb ids matching the appln ids are all present
    appln_ids, docdb_ids = collecting_patstat.collecting_ids(appln_ids, docdb_ids, test)
    collecting_patstat.collecting_info_on_applnids(appln_ids, docdb_ids, namefile, test)

    # Remaining steps, each announced in the log before it runs (order matters)
    steps = (
        ("       Aggregating oems patents at family level",
         lambda: family_aggregation.aggregate_at_familylevel(namefile)),
        ("       Collect all ipc and cpc codes of oems patents and aggregate them at family level",
         lambda: collecting_patstat.extract_and_save_cpc_ipc_codes(namefile, test)),
        ("       Collect naces codes associated to oems patents and aggregate them at family level",
         lambda: collecting_patstat.extract_and_save_nace_codes(namefile, test)),
        ("       Collect psn_sector of applicants associated to patents and aggregate them at family level",
         lambda: collecting_patstat.extract_and_save_psnsector(appln_ids, namefile, test)),
    )
    for message, run_step in steps:
        LOGGER.info(message)
        run_step()


def _backdate_to_filing_year(owned_families, columns):
    """Keep families filed strictly before ``Year`` and relabel ``Year`` with the filing year."""
    # Explicit copy: assigning a column on a boolean-filtered slice would
    # otherwise trigger pandas' chained-assignment warning.
    earlier = owned_families[owned_families['earliest_filing_year'] < owned_families['Year']].copy()
    earlier['Year'] = earlier['earliest_filing_year']
    return earlier[columns].drop_duplicates()


def get_docdbids_to_aggregate(OEMs, df_bvdid_docdbid, aggregation_level='OEM_Level1_ID', with_subsi=True,
                              subsi_first_year=2007):
    """Build the (Year, aggregation unit, bvdid, patent family) correspondence.

    Three pieces are stacked and de-duplicated:

    1. Families whose ``earliest_filing_year`` equals a ``Year`` in which the
       bvdid appears in ``OEMs`` (as ``Level2_bvdid`` and, if ``with_subsi``,
       as ``Sub_BvDID``).
    2. Families filed before the first ``Year`` observed for each
       ``aggregation_level`` unit: the ``Level2_bvdid`` values of that first
       year are assumed to hold for all earlier years.
    3. Only if ``with_subsi``: families filed before ``subsi_first_year`` by
       the ``Sub_BvDID`` values recorded in ``subsi_first_year``.

    In pieces 2 and 3, ``Year`` is replaced by the family's
    ``earliest_filing_year``.

    Args:
        OEMs: DataFrame with at least the columns ``Year``,
            ``aggregation_level``, ``Level2_bvdid`` and ``Sub_BvDID``.
        df_bvdid_docdbid: DataFrame with at least the columns ``bvdid``,
            ``docdb_family_id`` and ``earliest_filing_year``.
        aggregation_level: Name of the ``OEMs`` column identifying the unit
            to aggregate on (e.g. ``'OEM_Level1_ID'`` or ``'OEM_Level2_ID'``).
        with_subsi: Whether to include the subsidiaries' families.
        subsi_first_year: Year of ``OEMs`` whose subsidiary structure is
            applied to all earlier years. Defaults to 2007, the year the
            subsidiary ownership data is said to start.

    Returns:
        De-duplicated DataFrame containing ``Year``, ``aggregation_level``,
        ``bvdid``, ``docdb_family_id`` and ``earliest_filing_year`` (plus any
        extra columns of ``df_bvdid_docdbid``, filled only for piece 1).
    """
    out_cols = ['Year', aggregation_level, 'bvdid', 'docdb_family_id', 'earliest_filing_year']

    # --- 1. Years covered by the ownership data: match filing year to ownership year
    owners = [
        OEMs[['Year', aggregation_level, 'Level2_bvdid']].drop_duplicates().rename(columns={'Level2_bvdid': 'bvdid'})
    ]
    if with_subsi:
        owners.append(
            OEMs[['Year', aggregation_level, 'Sub_BvDID']].drop_duplicates().rename(columns={'Sub_BvDID': 'bvdid'})
        )
    owners = pd.concat(owners)
    postperiod = owners.merge(
        df_bvdid_docdbid,
        left_on=['bvdid', 'Year'],
        right_on=['bvdid', 'earliest_filing_year'],
        how='inner',
    ).drop_duplicates()

    # --- 2. Years before the ownership data starts, for the OEM ids.
    # Ownership data mostly starts around 2004 but sometimes later, so the
    # first observed year is computed per aggregation unit.
    minyear = OEMs.groupby(aggregation_level)['Year'].min().rename('min_year').reset_index()
    first_year_owners = OEMs[['Year', aggregation_level, 'Level2_bvdid']].drop_duplicates().merge(
        minyear, on=aggregation_level
    )
    first_year_owners = first_year_owners[first_year_owners['min_year'] == first_year_owners['Year']]
    pre_period = _backdate_to_filing_year(
        first_year_owners.merge(df_bvdid_docdbid, left_on='Level2_bvdid', right_on='bvdid', how='inner'),
        out_cols,
    )

    frames = [postperiod, pre_period]

    # --- 3. Same assumption for subsidiaries, anchored on `subsi_first_year`
    if with_subsi:
        # NOTE(review): dropna() discards a row if ANY column of OEMs is
        # missing, not only Sub_BvDID; kept as in the original behaviour.
        first_year_subsi = OEMs[OEMs['Year'] == subsi_first_year].dropna()
        first_year_subsi = first_year_subsi[['Year', aggregation_level, 'Sub_BvDID']].drop_duplicates()
        subsi_preperiod = _backdate_to_filing_year(
            first_year_subsi.merge(df_bvdid_docdbid, left_on='Sub_BvDID', right_on='bvdid', how='inner'),
            out_cols,
        )
        frames.append(subsi_preperiod)

    return pd.concat(frames).drop_duplicates()


def main(test=False):
    """Collect patent data for OEMs and their subsidiaries and aggregate it by firm-year.

    Steps:
        1. Load the OEM / subsidiary ownership table and log how the
           ``Level1_bvdid`` and ``Level2_bvdid`` sets overlap per year.
        2. Collect patent information for the ``Level2_bvdid`` firms
           (label ``'OEMs'``) and for the subsidiaries that are not themselves
           a ``Level2_bvdid`` (label ``'subsidiaries'``).
        3. Run the firm-year aggregation at OEM level 1 and level 2, each
           with and without subsidiaries, and finally at subsidiary level.

    Args:
        test: Flag forwarded to the collection helpers; its exact effect is
            defined in ``collecting_patstat`` (presumably a reduced run,
            confirm there).
    """
    LOGGER.info('SCRIPT: b_patent_oems.py')
    LOGGER.info("       test: {}".format(test))
    # Import data about relationship between OEMs and subsidiaries
    cols = ['Year', 'OEM_Level1_ID', 'Level1_bvdid', 'OEM_Level2_ID', 'Level2_bvdid', 'Sub_BvDID']
    OEMs = pd.read_csv(PATHS.dropbox / 'Data_outputted/A_AutoIndustry/OEM_and_Subsidiaries.csv', usecols=cols).drop_duplicates()
    LOGGER.info("       Check that all level 1 OEMs are also in level 2")
    # Outer merge with indicator: the '_merge' counts show which (Year, bvdid)
    # pairs exist only as level 1, only as level 2, or as both.
    data = OEMs[['Year', 'Level1_bvdid']].drop_duplicates()
    data = data.merge(OEMs[['Year', 'Level2_bvdid']].drop_duplicates(), left_on=['Year', 'Level1_bvdid'], right_on=['Year', 'Level2_bvdid'], indicator=True, how='outer')
    LOGGER.info(f"left is  Level1_bvdid // right is Level2_bvdid     \n{data['_merge'].value_counts()}")
    #######################################################################
    # COLLECTING PATENTS
    #######################################################################
    LOGGER.info("       Collect info about OEMs patents")
    list_bvdids = OEMs['Level2_bvdid'].unique()
    get_patent_info_for_list_of_bvdids(list_bvdids, 'OEMs', test)
    LOGGER.info("       Collect info about subsidiaries patents")
    # Subsidiaries that also appear as Level2_bvdid were already collected above
    list_bvdids = OEMs[~OEMs['Sub_BvDID'].isin(OEMs['Level2_bvdid'])]['Sub_BvDID'].dropna().drop_duplicates().tolist()
    get_patent_info_for_list_of_bvdids(list_bvdids, 'subsidiaries', test)
    #######################################################################
    # AGGREGATING
    #######################################################################
    LOGGER.info("       AGGREGATION FIRM-YEAR")
    # Import a df that has docdbids, bvdids and year info (merging patstat orbis correspondence with family files)
    df_bvdid_docdbid = firm_year_aggregation.get_docdbids_of_bvdids('OEMs')
    # One run per (aggregation level, with/without subsidiaries), in this order
    aggregation_runs = (
        ('OEM_Level1_ID', False, 'OEMs_level1_without_subsi'),
        ('OEM_Level1_ID', True, 'OEMs_level1_with_subsi'),
        ('OEM_Level2_ID', False, 'OEMs_level2_without_subsi'),
        ('OEM_Level2_ID', True, 'OEMs_level2_with_subsi'),
    )
    for aggregation_level, with_subsi, namefile in aggregation_runs:
        df_agg_corres = get_docdbids_to_aggregate(OEMs, df_bvdid_docdbid, aggregation_level=aggregation_level, with_subsi=with_subsi)
        firm_year_aggregation.main(df_agg_corres, df_bvdid_docdbid, aggregation_level=aggregation_level, namefile=namefile)
    # Aggregate at subsidiary level
    # .copy() so that adding 'Year' modifies an independent frame rather than a
    # slice of df_bvdid_docdbid (avoids pandas' chained-assignment warning).
    df_agg_corres = df_bvdid_docdbid[df_bvdid_docdbid['bvdid'].isin(OEMs['Sub_BvDID'])].copy()
    df_agg_corres['Year'] = df_agg_corres['earliest_filing_year']
    firm_year_aggregation.main(df_agg_corres, df_bvdid_docdbid, aggregation_level='bvdid', namefile='subsidiaries')
    LOGGER.info('SCRIPT END: b_patent_oems.py')

